import os
import glob
import pandas as pd  # High-performance data structures :contentReference[oaicite:3]{index=3}
import numpy as np
from pathlib import Path

# 1. 配置文件夹路径
folder = Path("Filtered_By_Media")
files = glob.glob(str(folder / "*.xlsx"))

# 验证读取到 50 个文件
print(f"Found {len(files)} media files.")

# 2. 定义函数：处理单个媒体文件
def process_media(file_path):
    media_name = Path(file_path).stem
    df = pd.read_excel(file_path, engine="openpyxl")  # 读取 Excel :contentReference[oaicite:4]{index=4}

    # 解析月份
    df['Month'] = pd.to_datetime(df['MonthYear'].astype(str), format='%Y%m', errors='coerce') \
                    .dt.to_period('M')

    # 标记 CT / TC 事件
    mask_ct = (df['Actor1CountryCode']=='CHN') & (df['Actor2CountryCode']=='TWN')
    mask_tc = (df['Actor1CountryCode']=='TWN') & (df['Actor2CountryCode']=='CHN')

    # 月度统计：事件数、提及数、加权 Goldstein/Tone 总和
    monthly = df.groupby('Month').apply(lambda g: pd.Series({
        'TotalEvents':   len(g),
        'CT_Events':     int(mask_ct[g.index].sum()),    # Mainland→Taiwan 条数
        'TC_Events':     int(mask_tc[g.index].sum()),    # Taiwan→Mainland 条数
        'TotalMentions': g['NumMentions'].sum(),         # 所有提及次数
        'CT_Mentions':   g.loc[mask_ct[g.index], 'NumMentions'].sum(),
        'TC_Mentions':   g.loc[mask_tc[g.index], 'NumMentions'].sum(),
        'ImpactRaw':     (g['GoldsteinScale'] * g['NumMentions']).sum(),
        'ToneRaw':       (g['AvgTone']       * g['NumMentions']).sum()
    }))

    # 3. 计算四项指数
    m = monthly.copy()
    # 3.1 Global Attention: log1p 平滑
    m['GlobalAttention'] = np.log1p(m['TotalMentions'])
    # 3.2 BalanceIndex: 事件占比，空月取 0.5 :contentReference[oaicite:5]{index=5}
    m['BalanceIndex'] = np.where(
        m['TotalEvents']>0,
        m['CT_Events']/m['TotalEvents'],
        0.5
    )
    # 3.3 ImpactIndex: 加权 Goldstein 归一化至 [0,1]
    m['ImpactIndex'] = np.where(
        m['TotalMentions']>0,
        (m['ImpactRaw']/m['TotalMentions'] + 10)/20,
        np.nan
    )
    # 3.4 ToneIndex: 加权 AvgTone 归一化至 [0,1]
    m['ToneIndex']   = np.where(
        m['TotalMentions']>0,
        (m['ToneRaw']  /m['TotalMentions'] + 10)/20,
        np.nan
    )

    # 重置索引并返回
    m = m.reset_index()
    m['Media'] = media_name
    return m

# 4. 对所有媒体依次处理并拼接
all_media = []
for f in files:
    dfm = process_media(f)
    all_media.append(dfm)
combined = pd.concat(all_media, ignore_index=True)  # 合并所有媒体数据 :contentReference[oaicite:6]{index=6}

# 5. 保存到 Excel：按媒体拆分 sheet
out_path = "Monthly_Indices_50_Media.xlsx"
with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
    for media_name, group in combined.groupby('Media'):
        group.drop(columns=['Media']).to_excel(writer,
                                              sheet_name=media_name[:31],  # sheet_name 最多 31 字符
                                              index=False)
print(f"✅ All indices saved into {out_path}")
